Lab Assignment Two: Exploring Image Data
Richmond Aisabor
To understand how galaxies formed, researchers need to be able to classify them according to their shape. This dataset is a sample of the roughly one hundred billion galaxies in the observable universe. Each of these galaxies, containing billions of stars, has had a unique life and has interacted with its environment in a different way. Studying the morphology of galaxies brings researchers closer to understanding how the universe — and ultimately humanity — came to exist. In order to relate the different shapes of galaxies to the physical phenomena that created them, such images must be classified. The overall goal is to understand the natural processes that shaped the universe by analyzing the formation of galaxies.
To be confident that the algorithm is learning properly, it must have a success rate better than random chance (which is 10% for ten balanced classes). The goal for the algorithm is to be as close to 100% accuracy as possible, and the user who benefits most from a successful algorithm is an astronomer. To be considered successful, the algorithm must have a success rate of at least 90%.
The dataset is called the Galaxy10 dataset. It contains 21,785 69x69 pixel colored images of galaxies and 10 distinct classes. Galaxy10 images were shot by The Sloan Digital Sky Survey (SDSS) and the classification labels were created by Galaxy Zoo.
Dataset source: https://astronn.readthedocs.io/en/latest/galaxy10.html
from astroNN.datasets import galaxy10
from astroNN.datasets.galaxy10 import galaxy10cls_lookup
from tensorflow.keras import utils
import numpy as np
from matplotlib import pyplot as plt
# Load the Galaxy10 images and integer labels into ndarrays.
galaxy_X, galaxy_y = galaxy10.load_data()
target = galaxy10.Galaxy10Class
# One-hot encode the labels over the 10 galaxy classes.
galaxy_y = utils.to_categorical(galaxy_y, 10)
# Optional float conversion (intentionally left disabled):
#galaxy_X = galaxy_X.astype(np.float32)
#galaxy_y = galaxy_y.astype(np.float32)
# Image array is (n_samples, height, width, channels).
n_samples, h, w, n_channels = galaxy_X.shape
n_classes = len(target)
n_features = h * w  # per-channel pixel count, used as the feature-vector length
print(f"n_samples: {n_samples}")
print(f"n_features: {n_features}")
print(f"n_classes: {n_classes}")
print(f"n_channels: {n_channels}")
print(f"Original Image Sizes {h} by {w}")
print(n_features)  # the size of the images are the size of the feature vectors
/Users/richmondaisabor/.astroNN/datasets/Galaxy10.h5 was found! n_samples: 21785 n_features: 4761 n_classes: 10 n_channels: 3 Original Image Sizes 69 by 69 4761
def plot_gallery(images, titles, n_row=2, n_col=5):
    """Plot a gallery of galaxy images with their class names as titles.

    images : indexable collection of (h, w, 3) image arrays.
    titles : per-image labels; either integer class indices or one-hot
             vectors (the call below passes `galaxy_y`, which has been
             one-hot encoded by `utils.to_categorical`).
    n_row, n_col : gallery grid dimensions.
    """
    plt.figure(figsize=(1.7 * n_col, 2.3 * n_row))
    plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
    for i in range(n_row * n_col):
        plt.subplot(n_row, n_col, i + 1)
        plt.imshow(images[i])
        # BUG FIX: galaxy10cls_lookup expects an integer class index, but
        # galaxy_y is one-hot encoded at this point — convert via argmax.
        label = titles[i]
        if np.ndim(label) > 0:
            label = int(np.argmax(label))
        plt.title(galaxy10cls_lookup(label), size=8)
        plt.xticks(())
        plt.yticks(())
plot_gallery(galaxy_X, galaxy_y)
import pandas as pd
from skimage.color import rgb2gray
# Flatten each image into one row of the feature matrix.
# NOTE: reshape alone is sufficient; the original .flatten().reshape(...)
# materialized an extra full copy of the ~21785 x 14283 array for no benefit.
X = galaxy_X.reshape(n_samples, h * w * n_channels)
# Greyscale version (one value per pixel) for the DAISY pipeline below.
galaxy_grey = rgb2gray(galaxy_X)
X_grey = galaxy_grey.reshape(n_samples, h * w)
df = pd.DataFrame(X)
df.head()
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 14273 | 14274 | 14275 | 14276 | 14277 | 14278 | 14279 | 14280 | 14281 | 14282 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 16 | 18 | 15 | 18 | 20 | 15 | 16 | 17 | 12 | 13 | ... | 1 | 8 | 11 | 3 | 11 | 14 | 6 | 12 | 15 | 7 |
| 1 | 1 | 3 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | ... | 0 | 1 | 3 | 0 | 2 | 4 | 1 | 2 | 4 | 1 |
| 2 | 7 | 10 | 7 | 2 | 5 | 2 | 2 | 5 | 1 | 2 | ... | 5 | 7 | 7 | 5 | 3 | 3 | 2 | 2 | 2 | 0 |
| 3 | 9 | 10 | 5 | 6 | 6 | 3 | 6 | 6 | 4 | 6 | ... | 2 | 7 | 9 | 4 | 12 | 14 | 9 | 13 | 15 | 10 |
| 4 | 1 | 3 | 2 | 1 | 3 | 2 | 4 | 6 | 5 | 5 | ... | 0 | 3 | 5 | 2 | 4 | 6 | 3 | 2 | 4 | 1 |
5 rows × 14283 columns
from sklearn.decomposition import PCA
# Fit a full-SVD PCA keeping the top 500 components — enough for ~95% of
# the variance, per the explained-variance discussion below.
n_components = 500
print ("Extracting the top %d eigenthings from %d images" % (
    n_components, n_samples))
pca = PCA(n_components=n_components)
# .copy() protects X because PCA centers the data in place during fit.
%time pca.fit(X.copy())
# Reshape each flat component vector back into per-pixel, per-channel form
# so the eigen-images can be displayed: (n_components, h*w, channels).
eigenthings = pca.components_.reshape((n_components, n_features, n_channels))
Extracting the top 500 eigenthings from 21785 images CPU times: user 3min 14s, sys: 10.6 s, total: 3min 25s Wall time: 2min 9s
# adapted from a Sebastian Raschka example (your book!)
# also from his blog here: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html
def plot_explained_variance(pca):
    """Plot per-component and cumulative explained-variance ratios.

    Renders an interactive Plotly chart: a bar trace for each component's
    individual explained variance plus a line trace for the running total.

    pca : a fitted sklearn PCA (or any object exposing
          `explained_variance_ratio_`).
    """
    import plotly
    from plotly.graph_objs import Bar, Line
    from plotly.graph_objs import Scatter, Layout
    from plotly.graph_objs.scatter import Marker
    from plotly.graph_objs.layout import XAxis, YAxis
    plotly.offline.init_notebook_mode()  # run at the start of every notebook
    individual = pca.explained_variance_ratio_
    cumulative = np.cumsum(individual)
    traces = [
        Bar(y=individual, name='individual explained variance'),
        Scatter(y=cumulative, name='cumulative explained variance'),
    ]
    layout = Layout(
        xaxis=XAxis(title='Principal components'),
        yaxis=YAxis(title='Explained variance ratio'),
    )
    plotly.offline.iplot({"data": traces, "layout": layout})
plot_explained_variance(pca)
After running principal component analysis, only 500 principal components are needed to account for 95% of the variance between the images. To adequately represent the data, only 500 principal components are necessary. The explained-variance plot after running PCA with 2000 components shows that 2000 principal components account for 99% of the variance between the images. Beyond 500 principal components, each additional component mostly adds redundancy to the analysis. Therefore, 500 principal components are sufficient to represent the images.
eigenthings_titles = ["eigenthing %d" % i for i in range(eigenthings.shape[0])]
def plot_gallery2(images, titles, n_row=2, n_col=5):
    """Plot a gallery of eigen-images (PCA component vectors).

    images : array of flat component vectors, each reshapeable to
             (h, w, n_channels).
    titles : per-image title strings.
    n_row, n_col : gallery grid dimensions.
    """
    plt.figure(figsize=(1.7 * n_col, 2.3 * n_row))
    plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
    for i in range(n_row * n_col):
        plt.subplot(n_row, n_col, i + 1)
        img = images[i].reshape(h, w, n_channels)
        # BUG FIX: PCA components contain negative entries; multiplying by
        # 255 and casting straight to uint8 wraps negatives around to bright
        # noise. Min-max normalize each component into [0, 1] first.
        lo, hi = img.min(), img.max()
        if hi > lo:
            img = (img - lo) / (hi - lo)
        plt.imshow((img * 255).astype(np.uint8))
        plt.title(titles[i], size=8)
        plt.xticks(())
        plt.yticks(())
plot_gallery2(eigenthings, eigenthings_titles)
# Repeat the extraction with the randomized SVD solver to compare its
# runtime against the full solver above (same 500 components).
n_components = 500
print ("Extracting the top %d eigenthings from %d images" % (
    n_components, X.shape[0]))
rpca = PCA(n_components=n_components, svd_solver='randomized')
# .copy() protects X from PCA's in-place centering during fit.
%time rpca.fit(X.copy())
# Flat component vectors reshaped to (n_components, h*w, channels) images.
eigenthings = rpca.components_.reshape((n_components, n_features, n_channels))
Extracting the top 500 eigenthings from 21785 images CPU times: user 3min 26s, sys: 4.96 s, total: 3min 31s Wall time: 1min 52s
# Explained-variance curve and eigen-image gallery for the randomized PCA,
# mirroring the plots produced for the full PCA above.
plot_explained_variance(rpca)
eigenthings_titles = ["eigenthing %d" % i for i in range(eigenthings.shape[0])]
plot_gallery2(eigenthings, eigenthings_titles)
# Side-by-side comparison of the two PCA variants: runtime on the left
# axis, cumulative explained variance on the right. Values are transcribed
# from the %time outputs and explained-variance plots above.
labels = ['PCA', 'Randomized PCA']
time = [3, 3]
explain_var = [0.96, 0.96]
width = 0.35                 # the width of the bars
x = np.arange(len(labels))   # the label locations
fig, ax = plt.subplots(figsize=(8, 6))
ax.set_ylabel('Time (in minutes)')
ax.bar(x - width / 2, time, width, label='Time', align="center", color="blue")
# Second y-axis shares the x positions but carries the variance scale.
ax2 = ax.twinx()
ax2.set_ylabel('Cumulative Explained Variance')
ax2.bar(x + width / 2, explain_var, width, label='Explained Variance',
        align="center", color="orange")
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend(loc="best")
ax2.legend(loc="upper left")
fig.tight_layout()
The preferred method is either PCA or randomized PCA, because on this dataset the time it takes to run full PCA is about the same as randomized PCA. Given that both take the same amount of time, either method will represent the images equally well. With more components, a PCA would take longer to compute. Since both the randomized PCA and the full PCA use 500 principal components here, full PCA runs about as efficiently as randomized PCA.
from ipywidgets import fixed
import copy
from sklearn.metrics.pairwise import pairwise_distances
# Project every image into the 500-D PCA feature space, then compute the
# pairwise distance between all the different image features.
# NOTE: neither PCA.transform nor pairwise_distances mutates its input, so
# the deepcopy calls the original made here only wasted time and gigabytes
# of memory on this ~21785 x 14283 array.
X_pca_features = rpca.transform(X)
dist_matrix_pca = pairwise_distances(X_pca_features,
                                     metric="seuclidean")
from skimage.io import imshow
from skimage.filters import sobel_h, sobel_v
# Show one randomly chosen greyscale galaxy (left) next to the magnitude
# of its Sobel gradient (right), which highlights edges in the image.
idx_to_reconstruct = int(np.random.rand(1) * len(X))
img = X_grey[idx_to_reconstruct].reshape((h, w))
plt.subplot(1, 2, 1)
imshow(img)
plt.grid(False)
plt.subplot(1, 2, 2)
dx = sobel_v(img)
dy = sobel_h(img)
gradient_mag = np.sqrt(dx**2 + dy**2)
imshow(gradient_mag)
plt.grid(False)
from skimage.feature import daisy
# Extract a DAISY descriptor grid for the sample image and visualize the
# descriptor layout on top of it.
features, img_desc = daisy(
    img,
    step=40,
    radius=20,
    rings=2,
    histograms=8,
    orientations=8,
    visualize=True,
)
imshow(img_desc)
plt.grid(False)
# Total descriptor length is the product of the grid and histogram axes.
print(features.shape)
print(features.shape[0] * features.shape[1] * features.shape[2])
(1, 1, 136) 136
def apply_daisy(row, shape):
    """Compute a flat DAISY descriptor for one flattened greyscale image.

    row   : 1-D pixel vector for a single image.
    shape : (height, width) tuple used to restore the 2-D image.
    Returns the descriptor flattened to a 1-D vector.
    """
    descriptor = daisy(row.reshape(shape), step=20, radius=20,
                       rings=2, histograms=8, orientations=4,
                       visualize=False)
    return descriptor.reshape(-1)
# Time a single-image DAISY extraction as a sanity check before running it
# over the whole dataset.
%time test_feature = apply_daisy(X_grey[3], (h,w))
# One image yields a flat 272-element descriptor (see output below).
test_feature.shape
CPU times: user 8.36 ms, sys: 2.08 ms, total: 10.4 ms Wall time: 9.63 ms
(272,)
# Apply the DAISY extractor row-by-row over every flattened greyscale
# image (slow: roughly a couple of minutes for the full dataset).
%time daisy_features = np.apply_along_axis(apply_daisy, 1, X_grey, (h,w))
print(X_grey.shape)
print(daisy_features.shape)
CPU times: user 2min 14s, sys: 3.04 s, total: 2min 17s Wall time: 2min 20s (21785, 4761) (21785, 272)
from sklearn.metrics.pairwise import pairwise_distances
# find the pairwise distance between all the different image features
# (Euclidean by default) across all DAISY descriptors.
%time dist_matrix = pairwise_distances(daisy_features)
CPU times: user 16.8 s, sys: 5.88 s, total: 22.7 s Wall time: 21.5 s
import ipywidgets as widgets
from ipywidgets import fixed
# put it together inside a nice widget
def closest_image(dmat_daisy, dmat_pca, idx1):
    """Show image idx1 next to its nearest neighbours under two metrics.

    dmat_daisy : precomputed pairwise-distance matrix over DAISY features.
    dmat_pca   : precomputed pairwise-distance matrix over PCA features.
    idx1       : index of the query image.
    """
    def _nearest(dmat):
        # Copy the row so masking out the query does not mutate the shared
        # distance matrix between widget invocations.
        row = copy.deepcopy(dmat[idx1, :])
        # BUG FIX: np.infty was removed in NumPy 2.0 — use np.inf.
        row[idx1] = np.inf  # dont pick the same image!
        return np.argmin(row)

    def _label(y):
        # BUG FIX: galaxy_y is one-hot encoded here, but galaxy10cls_lookup
        # expects an integer class index — convert via argmax.
        return galaxy10cls_lookup(int(np.argmax(y)))

    idx2 = _nearest(dmat_daisy)  # DAISY nearest neighbour
    idx3 = _nearest(dmat_pca)    # PCA nearest neighbour
    plt.figure(figsize=(10, 25))
    plt.subplot(1, 3, 1)
    imshow(X[idx1].reshape((h, w, n_channels)))
    plt.title("Original:" + _label(galaxy_y[idx1]))
    plt.grid()
    plt.subplot(1, 3, 2)
    imshow(X_grey[idx2].reshape((h, w)))
    plt.title("DAISY Closest:" + _label(galaxy_y[idx2]))
    plt.grid()
    plt.subplot(1, 3, 3)
    imshow(X[idx3].reshape((h, w, n_channels)))
    plt.title("PCA Closest:" + _label(galaxy_y[idx3]))
    plt.grid()
# Interactive browser: slide idx1 over 0..n_samples-1 to view an image
# alongside its DAISY- and PCA-nearest neighbours; the distance matrices
# are passed as fixed (non-interactive) arguments, and __manual defers
# recomputation until the button is pressed.
widgets.interact(closest_image,idx1=(0,n_samples-1,1),
                 dmat_daisy=fixed(dist_matrix),
                 dmat_pca=fixed(dist_matrix_pca),
                 __manual=True)
<function __main__.closest_image(dmat_daisy, dmat_pca, idx1)>
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
# Compare 1-NN classification accuracy on PCA features vs DAISY features,
# using one shared train/test split across both feature sets.
# NOTE(review): galaxy_y is one-hot encoded here, so sklearn treats this as
# a multilabel problem and accuracy_score reports exact-match accuracy —
# equivalent to class accuracy for one-hot rows, but worth confirming.
knn_pca = KNeighborsClassifier(n_neighbors=1)
knn_dsy = KNeighborsClassifier(n_neighbors=1)
# FIX: pin random_state so the reported accuracies are reproducible
# (the original split changed on every run).
pca_train, pca_test, dsy_train, dsy_test, y_train, y_test = train_test_split(
    X_pca_features, daisy_features, galaxy_y,
    test_size=0.2, train_size=0.8, random_state=42)
knn_pca.fit(pca_train, y_train)
# FIX: accuracy_score's documented order is (y_true, y_pred); accuracy is
# symmetric so the value is unchanged, but the call now matches the API.
acc_pca = accuracy_score(y_test, knn_pca.predict(pca_test))
knn_dsy.fit(dsy_train, y_train)
acc_dsy = accuracy_score(y_test, knn_dsy.predict(dsy_test))
# Bar chart of the two accuracies.
plt.subplots(figsize=(8, 6))
labels = ["PCA", "Daisy"]
acc = [acc_pca, acc_dsy]
colors = ['r', 'b']
plt.bar(labels, acc, align='center', width=0.4, color=colors)
<BarContainer object of 2 artists>
After finding the accuracy scores for each feature-extraction method, it is apparent that PCA feature extraction is the most accurate at predicting images in the galaxy dataset. The PCA representation uses 500 principal components, while the DAISY representation contains only 272 features. The increased accuracy of PCA is likely a result of its larger feature space.